{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34e669bf-e1ab-4122-8a72-ae255fbf9437",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from lxml import etree\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml import html\n",
    "import string\n",
    "import requests_file\n",
    "import time\n",
    "import glob, os\n",
    "import time\n",
    "import re\n",
    "from datetime import datetime\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "27e670ad-741e-48ed-8d71-e3b1cb3f7074",
   "metadata": {},
   "outputs": [],
   "source": [
    "def days_between(d1, d2):\n",
    "    '''\n",
    "    This function computes the duration between two dates\n",
    "    '''\n",
    "    d1 = datetime.strptime(d1, \"%x\")\n",
    "    d2 = datetime.strptime(d2, \"%x\")\n",
    "    return abs((d2 - d1).days)\n",
    "\n",
    "def cc_roa_parser(filename,path):\n",
    "    '''\n",
    "    This function aims to identify the reschedulings in 7th Circuit Court ROAs.\n",
    "    '''\n",
    "    case_number = filename.strip('.html')\n",
    "    file = path+\"/\"+filename\n",
    "    with open(file, \"r\") as f:\n",
    "        page = f.read()\n",
    "    html = etree.HTML(page)\n",
    "    entries = html.xpath('.//table[@id=\"dlROA\"]//tr/td/span/text()')\n",
    "    records = pd.DataFrame()\n",
    "    cleaned_entries = []\n",
    "    for e in entries:\n",
    "        cleaned_entries.append(e.replace(u'\\xa0', u' ').strip())\n",
    "    for ce in cleaned_entries:\n",
    "        if 'Actions, Judgments, Case Notes' in ce:\n",
    "            index = cleaned_entries.index(ce)\n",
    "            \n",
    "    actions = [case_number]\n",
    "    for ce in cleaned_entries[index+4:-1]:\n",
    "        actions.append(ce)\n",
    "            \n",
    "    remove_list=[]\n",
    "    for a in actions:\n",
    "        if 'REMOVE NEXT' in a:\n",
    "            exluded = [i for i in actions[actions.index(a)+1:] if i[:1].isdigit()]\n",
    "            try:\n",
    "                exclude_index = actions.index(exluded[1])\n",
    "                potential_removal = actions[actions.index(a):exclude_index]\n",
    "                if any(\"ADJ\" in pr for pr in potential_removal):\n",
    "                    remove_list.append(potential_removal)\n",
    "            except:\n",
    "                potential_removal = actions[actions.index(a):]\n",
    "                if any(\"ADJ\" in pr for pr in potential_removal):\n",
    "                    remove_list.append(potential_removal)\n",
    "                    \n",
    "    if len(remove_list)>0:\n",
    "        for rl in remove_list:\n",
    "            removed_dates = {}\n",
    "            notice_date = re.findall(\"\\d \\d{2}\\/\\d{2}\\/\\d{2}\", rl[0])[0].split(\" \")[1]\n",
    "            old_date = re.findall(\"\\\\bREMOVE NEXT EVENT: \\d{2}\\/\\d{2}\\/\\d{2}\", rl[0])[0].split(\"REMOVE NEXT EVENT: \")[1]\n",
    "            event = rl[1]\n",
    "            if \"NOTICE SENT FOR\" in rl[2] or \"SET NEXT DATE\" in rl[2]:\n",
    "                new_date = re.findall(\"\\d{2}\\/\\d{2}\\/\\d{2}\", rl[2])[0]\n",
    "            elif \"NOTICE SENT FOR\" in rl[3] or \"SET NEXT DATE\" in rl[3]:\n",
    "                new_date = re.findall(\"\\d{2}\\/\\d{2}\\/\\d{2}\", rl[3])[0]\n",
    "                \n",
    "            for r in rl:\n",
    "                if \"COVID\" in r or \"covid\"  in r or 'STATE OF EMERGENCY' in r:\n",
    "                    cause = 'COVID19'\n",
    "                elif \"DUE TO\" in r:\n",
    "                    if len(re.findall(\"\\\\bDUE TO [a-zA-Z]\", r))>0:\n",
    "                        try:\n",
    "                            cause = r.split(\"DUE TO\")[1] + rl[rl.index(r)+1]\n",
    "                        except:\n",
    "                            pass\n",
    "                    else:\n",
    "                        cause = rl[rl.index(r):]\n",
    "                elif \"PER\" in r:\n",
    "                    if len(re.findall(\"\\\\bPER [a-zA-Z]\", r))>0:\n",
    "                        try:\n",
    "                            cause = r.split(\"PER\")[1]+rl[rl.index(r)+1]\n",
    "                        except:\n",
    "                            pass\n",
    "                    else:\n",
    "                        cause = rl[rl.index(r):]\n",
    "                elif \"COURT UNAVAILABLE\" in r or \"court unavailable\" in r:\n",
    "                    cause = \"COURT UNAVAILABLE\"\n",
    "                elif \"JUDGE UNAVAILABLE\" in r or \"judge unavailable\" in r:\n",
    "                    cause = \"JUDGE UNAVAILABLE\"\n",
    "                        \n",
    "            \n",
    "            duration = days_between(old_date, new_date)\n",
    "            \n",
    "            removed_dates[\"case_number\"]=case_number\n",
    "            removed_dates[\"event\"] = event \n",
    "            removed_dates[\"notice_date\"] = notice_date\n",
    "            removed_dates[\"old_dates\"] = old_date\n",
    "            removed_dates[\"new_date\"] = new_date\n",
    "            removed_dates[\"duration\"] = duration\n",
    "            \n",
    "            try:\n",
    "                removed_dates[\"cause\"] = cause\n",
    "            except:\n",
    "                removed_dates[\"cause\"] = '.'\n",
    "            \n",
    "            removed_dates = pd.DataFrame([removed_dates], \n",
    "                                         columns=removed_dates.keys())\n",
    "            records = pd.concat([records,removed_dates])\n",
    "    return(records)\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c3762da-694e-4c2d-bd4f-9b9654bba691",
   "metadata": {},
   "outputs": [],
   "source": [
    "path = \"C:/Users/\"+os.getlogin()+\"/Dropbox/IGNITE/2input_data/C07/3ROAs\"\n",
    "files=[]\n",
    "for file in os.listdir(path):\n",
    "    if file.endswith('.html'):\n",
    "        files.append(file)\n",
    "\n",
    "data = cc_roa_parser(files[0],path)\n",
    "for f in files[1:]:\n",
    "    data = pd.concat([data,cc_roa_parser(f,path)])\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af7f5696-5856-436b-9bb4-7765361a9a0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_excel(\"C:/Users/\"+os.getlogin()+\"/Dropbox/IGNITE/3output_data/C07/circuit_removes.xlsx\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
